{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from pandas.tseries.offsets import *\n",
    "# from xiao_utils import months_among, f, f1\n",
    "from xiao_utils import f"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 20297 entries, 0 to 139\n",
      "Data columns (total 32 columns):\n",
      "TR                       20157 non-null object\n",
      "brand_id                 20297 non-null int64\n",
      "car_height               20157 non-null float64\n",
      "car_length               20157 non-null float64\n",
      "car_width                20157 non-null float64\n",
      "class_id                 20297 non-null int64\n",
      "compartment              20157 non-null float64\n",
      "cylinder_number          20157 non-null float64\n",
      "department_id            20157 non-null float64\n",
      "displacement             20157 non-null float64\n",
      "driven_type_id           20157 non-null float64\n",
      "emission_standards_id    20157 non-null float64\n",
      "engine_torque            20138 non-null float64\n",
      "equipment_quality        20157 non-null float64\n",
      "front_track              20157 non-null float64\n",
      "fuel_type_id             20154 non-null float64\n",
      "gearbox_type             20157 non-null object\n",
      "if_MPV_id                20157 non-null float64\n",
      "if_charging              20157 non-null object\n",
      "if_luxurious_id          20157 non-null float64\n",
      "level_id                 19859 non-null float64\n",
      "newenergy_type_id        20157 non-null float64\n",
      "power                    20157 non-null float64\n",
      "price                    11377 non-null float64\n",
      "price_level              20157 non-null float64\n",
      "rated_passenger          20157 non-null object\n",
      "rear_track               20157 non-null float64\n",
      "sale_date                20297 non-null datetime64[ns]\n",
      "sale_quantity            20157 non-null float64\n",
      "total_quality            20157 non-null float64\n",
      "type_id                  20157 non-null float64\n",
      "wheelbase                20157 non-null float64\n",
      "dtypes: datetime64[ns](1), float64(25), int64(2), object(4)\n",
      "memory usage: 5.1+ MB\n"
     ]
    }
   ],
   "source": [
    "# 将level_id字段中的-替换为np.nan\n",
    "df = pd.read_csv('../../data/origin/[new] yancheng_train_20171226.csv', dtype={'sale_date':str}, na_values=['-'], low_memory=False)\n",
    "df['sale_date']= pd.to_datetime(df['sale_date'], format='%Y%m')\n",
    "\n",
    "# 将price_level字段转换成有序类别的类型，并用其数值填入该列。\n",
    "df['price_level'] = df['price_level'].astype('category', categories=['5WL','5-8W','8-10W','10-15W','15-20W','20-25W','25-35W','35-50W','50-75W'], ordered=True)\n",
    "df['price_level'] = df['price_level'].cat.codes\n",
    "\n",
    "# 待选方案：先把power和扭矩字段带/的行复制一份，然后将新行里的销量清零，将原行和新行的power和扭矩字段的值分别赋为slash前后的值。\n",
    "# 现行方案：先他娘的直接把slash和后面的值删掉。省得影响记录条数相关的统计量。\n",
    "def process_power_and_torque(s):\n",
    "    return s.split('/')[0]\n",
    "df['power'] = df['power'].astype(str).apply(process_power_and_torque).astype(float) #[18600]\n",
    "df['engine_torque'] = df['engine_torque'].astype(str).apply(process_power_and_torque).astype(float)\n",
    "\n",
    "# -------------------------------------------------------------\n",
    "# 把2017年11月的数据拼进来，一块填入其特征，用于最终输出要提交的结果。\n",
    "empty_Nov = pd.read_csv('../../data/origin/yancheng_testA_20171225.csv', dtype={'predict_date':str}, na_values=['-'], low_memory=False)\n",
    "empty_Nov['predict_date']= pd.to_datetime(empty_Nov['predict_date'], format='%Y%m')\n",
    "empty_Nov.rename(columns = {'predict_date': 'sale_date', 'predict_quantity':'sale_quantity'}, inplace = True)\n",
    "\n",
    "\n",
    "# 读取玩了，先不急着拼，先把车型到品牌的映射关系join进来\n",
    "class_to_brand = df[['class_id','brand_id']].groupby(['class_id']).mean().reset_index()\n",
    "empyt_Nov = pd.merge(left=empty_Nov, right=class_to_brand, on='class_id', how='left')\n",
    "empty_Nov['brand_id']= class_to_brand['brand_id']\n",
    "# empty_Nov\n",
    "# class_to_brand\n",
    "\n",
    "# class_to_brand\n",
    "\n",
    "# 读取完了，拼上去\n",
    "df = pd.concat([df, empty_Nov])\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "%qtconsole"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "9. 过去几年中，当前月的总销量占当年总销量的比例\n",
    "1. 过去几年中，单月各车型总销量"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 该类型特征的 column name 都以 T_ 开头，意为Time（时间）\n",
    "#### 单纯的时间信息\n",
    "1. 年\n",
    "2. 月\n",
    "3. 年、月的Onehot\n",
    "4. 月份相关的【还未加入】\n",
    "    4. 当月有几个周六\n",
    "    5. 当月有几个周日\n",
    "    6. 当月有多少天\n",
    "    7. 当月周六+周日一共有多少天\n",
    "    8. 当月周末占全月天数的比例（即上一条除以上上一条）\n",
    "    10. 当月有多少天假期\n",
    "9. 当月是哪个季节？？【还未加入】\n",
    "1. 当月是上半年还是下半年？？【还未加入】\n",
    "\n",
    "#### 历史销量信息（不区分车型/品牌）~~【还未加入】~~\n",
    "1. 过去5年，单月各车型总销量【T_som_i, Sale of One Month】\n",
    "3. 从去年开始往前看，当前月对应月份的销量在当年总销量中的比例（如当前行的sale_date=201710，则这个特征计算的是201610的总销量在2016年全年销量中的占比）【还未加入】\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['TR', 'brand_id', 'car_height', 'car_length', 'car_width', 'class_id',\n",
       "       'compartment', 'cylinder_number', 'department_id', 'displacement',\n",
       "       'driven_type_id', 'emission_standards_id', 'engine_torque',\n",
       "       'equipment_quality', 'front_track', 'fuel_type_id', 'gearbox_type',\n",
       "       'if_MPV_id', 'if_charging', 'if_luxurious_id', 'level_id',\n",
       "       'newenergy_type_id', 'power', 'price', 'price_level', 'rated_passenger',\n",
       "       'rear_track', 'sale_date', 'sale_quantity', 'total_quality', 'type_id',\n",
       "       'wheelbase'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "DatetimeIndex: 71 entries, 2012-01-01 to 2017-11-01\n",
      "Data columns (total 1 columns):\n",
      "T_som_0    70 non-null float64\n",
      "dtypes: float64(1)\n",
      "memory usage: 1.1 KB\n"
     ]
    }
   ],
   "source": [
    "g_date = df[['sale_date','sale_quantity']].groupby('sale_date').sum()\n",
    "g_date = g_date.rename(columns={'sale_quantity':'T_som_0'})\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>T_som_0</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sale_date</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2012-01-01</th>\n",
       "      <td>22927.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2012-02-01</th>\n",
       "      <td>14433.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2012-03-01</th>\n",
       "      <td>15587.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2012-04-01</th>\n",
       "      <td>13438.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2012-05-01</th>\n",
       "      <td>18195.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            T_som_0\n",
       "sale_date          \n",
       "2012-01-01  22927.0\n",
       "2012-02-01  14433.0\n",
       "2012-03-01  15587.0\n",
       "2012-04-01  13438.0\n",
       "2012-05-01  18195.0"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "g_date.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 20297 entries, 0 to 139\n",
      "Data columns (total 53 columns):\n",
      "TR                       20157 non-null object\n",
      "brand_id                 20297 non-null int64\n",
      "car_height               20157 non-null float64\n",
      "car_length               20157 non-null float64\n",
      "car_width                20157 non-null float64\n",
      "class_id                 20297 non-null int64\n",
      "compartment              20157 non-null float64\n",
      "cylinder_number          20157 non-null float64\n",
      "department_id            20157 non-null float64\n",
      "displacement             20157 non-null float64\n",
      "driven_type_id           20157 non-null float64\n",
      "emission_standards_id    20157 non-null float64\n",
      "engine_torque            20138 non-null float64\n",
      "equipment_quality        20157 non-null float64\n",
      "front_track              20157 non-null float64\n",
      "fuel_type_id             20154 non-null float64\n",
      "gearbox_type             20157 non-null object\n",
      "if_MPV_id                20157 non-null float64\n",
      "if_charging              20157 non-null object\n",
      "if_luxurious_id          20157 non-null float64\n",
      "level_id                 19859 non-null float64\n",
      "newenergy_type_id        20157 non-null float64\n",
      "power                    20157 non-null float64\n",
      "price                    11377 non-null float64\n",
      "price_level              20157 non-null float64\n",
      "rated_passenger          20157 non-null object\n",
      "rear_track               20157 non-null float64\n",
      "sale_date                20297 non-null datetime64[ns]\n",
      "sale_quantity            20157 non-null float64\n",
      "total_quality            20157 non-null float64\n",
      "type_id                  20157 non-null float64\n",
      "wheelbase                20157 non-null float64\n",
      "year                     20297 non-null int64\n",
      "month                    20297 non-null int64\n",
      "is_leap_year             20297 non-null bool\n",
      "year_oh_2012             20297 non-null uint8\n",
      "year_oh_2013             20297 non-null uint8\n",
      "year_oh_2014             20297 non-null uint8\n",
      "year_oh_2015             20297 non-null uint8\n",
      "year_oh_2016             20297 non-null uint8\n",
      "year_oh_2017             20297 non-null uint8\n",
      "month_oh_1               20297 non-null uint8\n",
      "month_oh_2               20297 non-null uint8\n",
      "month_oh_3               20297 non-null uint8\n",
      "month_oh_4               20297 non-null uint8\n",
      "month_oh_5               20297 non-null uint8\n",
      "month_oh_6               20297 non-null uint8\n",
      "month_oh_7               20297 non-null uint8\n",
      "month_oh_8               20297 non-null uint8\n",
      "month_oh_9               20297 non-null uint8\n",
      "month_oh_10              20297 non-null uint8\n",
      "month_oh_11              20297 non-null uint8\n",
      "month_oh_12              20297 non-null uint8\n",
      "dtypes: bool(1), datetime64[ns](1), float64(25), int64(4), object(4), uint8(18)\n",
      "memory usage: 5.8+ MB\n"
     ]
    }
   ],
   "source": [
    "# 单纯时间信息\n",
    "df['year'] = df['sale_date'].apply(lambda x:x.year)\n",
    "df['month'] = df['sale_date'].apply(lambda x:x.month)\n",
    "df['is_leap_year'] = df['sale_date'].apply(lambda x:x.is_leap_year)\n",
    "df['year_oh'] = df['year']\n",
    "df['month_oh'] = df['month']\n",
    "df = pd.get_dummies(df, columns=['year_oh','month_oh'])\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "T_features = pd.merge(df, g_date, how='left', left_on='sale_date', right_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "T_features.to_csv(\"../../data/features/T_features.csv\",index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 历史销量信息\n",
    "# 主要逻辑\n",
    "def calc_sale_features_on_time(df):\n",
    "    \"\"\"\n",
    "    Args:\n",
    "        df: 完整的数据集\n",
    "    Return:\n",
    "        tmp：基于历史销量（不分品牌和车型）信息，构造出的特征们\n",
    "    \"\"\"\n",
    "#     g_date = df.groupby(['class_id','sale_date']).sum().reset_index()[['class_id','sale_date','sale_quantity']]\n",
    "#     gg = g_date.groupby('class_id')\n",
    "    \n",
    "    g_date = df[['sale_date','sale_quantity']].groupby('sale_date').sum()\n",
    "    \n",
    "\n",
    "    # 过去几年内的每个月销量\n",
    "    tmp = g_cls_date\n",
    "    for i in range(61):\n",
    "        tmp['sale_of_month_' + str(i+1) + '_ago'] = gg.apply(f, -(i+1), -i).reset_index()['sale_date']\n",
    "\n",
    "\n",
    "    # 该车型过去2~60个月分别的销量和\n",
    "    tmp['sum_sale_of_last_1_month'] = tmp['sale_of_month_1_ago']\n",
    "    for i in range(60):\n",
    "        # tmp['sum_sale_of_last_' + str(i+1) + '_month'] = gg.apply(f, -(i+1), 0).reset_index()['sale_date']\n",
    "        tmp['sum_sale_of_last_' + str(i+2) + '_month'] = tmp['sum_sale_of_last_' + str(i+1) + '_month'] + \\\n",
    "                                                            tmp['sale_of_month_' + str(i+2) + '_ago']\n",
    "    tmp = tmp.drop('sum_sale_of_last_1_month', axis=1) # 再把这一列删掉，因为和前面的 sale_of_month_1_ago 列是重复的\n",
    "    \n",
    "    # 一阶差分\n",
    "    for i in range(60):\n",
    "        thismonth = tmp['sale_of_month_' + str((i+1)*12) + '_ago']\n",
    "        lastmonth = tmp['sale_of_month_' + str((i+1)*12+1) + '_ago'] # gg.apply(f, -(i+1)*12-1, -(i+1)*12).reset_index()['sale_date']\n",
    "        tmp['rate_of_this_month_divby_last_month_' + str(i+1) + '_year_ago'] = thismonth / lastmonth\n",
    "        tmp['diff_of_this_month_sub_last_month_' + str(i+1) + '_year_ago'] = thismonth - lastmonth\n",
    "\n",
    "    # 该车型往年这个月比上个月的销量比值\n",
    "    # 该车型往年这个月减去上个月的销量差值\n",
    "    for i in range(3): # 只看过去三年的\n",
    "        thismonth = tmp['sale_of_month_' + str((i+1)*12) + '_ago']\n",
    "        lastmonth = tmp['sale_of_month_' + str((i+1)*12+1) + '_ago'] # gg.apply(f, -(i+1)*12-1, -(i+1)*12).reset_index()['sale_date']\n",
    "        tmp['rate_of_this_month_divby_last_month_' + str(i+1) + '_year_ago'] = thismonth / lastmonth\n",
    "        tmp['diff_of_this_month_sub_last_month_' + str(i+1) + '_year_ago'] = thismonth - lastmonth\n",
    "\n",
    "    # 该车型上个月比上上个月的比值\n",
    "    thisyear_lastmonth = tmp['sale_of_month_1_ago']\n",
    "    thisyear_lastlastmonth = tmp['sale_of_month_2_ago']\n",
    "    tmp['rate_of_last_divby_lastlast'] = thisyear_lastmonth / thisyear_lastlastmonth\n",
    "    # 该车型上个月减去上上个月的差值\n",
    "    tmp['diff_of_last_sub_lastlast'] = thisyear_lastmonth - thisyear_lastlastmonth\n",
    "\n",
    "    # 注意要把np.inf替换为空值，在上面算月销量比例时，引入了inf，其实应该作为空值。\n",
    "    return tmp.replace([np.inf, -np.inf], np.nan)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
